####################This code generates figure 3 and runs the simulation with State failures data####################

library(haven)

####################Set.seed####################

set.seed(213)

####################Set Working Directory to the Replication Folder####################

####################Read State-Failures Data####################

statefailure <- read_dta("sort11v3.dta")
# Requires sort11v3.dta available at https://doi.org/10.7910/DVN/BS4236 (last accessed Jun 25, 2021)
# King, Gary; Zeng, Langche, 2007, "Replication data for: Improving Forecasts of State Failure", at https://doi.org/10.7910/DVN/BS4236, Harvard Dataverse, V4, UNF:3:CEsbEgPxbxExfYuh2NWwWQ== [fileUNF]

####################Delete country codes and multiple codes for region#################### 
statefailurepure <- subset(statefailure,select = -c(sftgcode,sftgname,sftgreg,sftgmusl,sftgreg1,sftgreg2,sftgsts))

####################max number of covs to consider####################
J <- 100 

####################Number of Simulations####################
sims = 25000
# Note: 25000 simulations is time-consuming. Replicators may want to set this to 2500. 

####################Summary statistics (Table 1: Right Column)####################
isnamatrix <- is.na(statefailurepure)
prop.missing <- mean(colMeans(isnamatrix))
max.missing <- max(colMeans(isnamatrix))
min.missing <- min(colMeans(isnamatrix))
which(colMeans(isnamatrix)==min.missing)
####################79 Variables Fully Observed####################
####################Final Dataset Used Excluding Fully Missing####################

# number of variables to be dropped because fully missing
length(which(colMeans(isnamatrix)==max.missing))

statefailure_final <- subset(statefailurepure,select = -c(which(colMeans(isnamatrix)==max.missing)
))

####################Redefine the namatrix and K####################
K <- ncol(statefailure_final)
isnamatrix <- is.na(statefailure_final)

####################New max.missing####################
max.missing <- max(colMeans(isnamatrix))
which(colMeans(isnamatrix)==max.missing)
####################Table 1: Right Column####################

table1_right <- matrix(, nrow=6, ncol=1)
table1_row_names <- c("Unit of Observations", "Number of Observations","Number of Variables","Proportion of Missing Values (Avg)","Proportion of Missing Values (Max)", "Number of Variables Fully Observed")
row.names(table1_right) <- table1_row_names
colnames(table1_right) <- "StateFailures"
statefailures_column <- c("Countrie-Years",nrow(statefailure_final),ncol(statefailure_final),prop.missing,max.missing,79)
table1_right[,1] <- statefailures_column
table1_right

####################Create results matrix####################
results <- matrix(,nrow=sims, ncol=J)

####################Running Simulations####################
for(i in 1:sims){

  #################### handle case of j = 1####################
  s <- sample.int(K,1,replace=FALSE)
 
  results[i,1] <- mean(isnamatrix[,s] == 0)
  
  results[i,2:J] <- sapply(2:J,function(j) mean(rowSums(isnamatrix[,sample.int(K,j,replace=FALSE)]) == 0))
  
  
  cat(i,"")
}

####################Output results####################
write.table(results,"SIM25000_SF.csv",row.names=FALSE, sep=",")


####################Input results#################### 
results <- read.csv("SIM25000_SF.csv")

####################Expected Proportion of Data Observed####################
expected <- colMeans(results)
J <- length(expected)
expected_table_sf  <- cbind(c(1:J),c(expected)) 
colnames(expected_table_sf ) <- c("Number of Variables","Expected Proportion of Data Observed")
expected_table_sf 

####################Probability of All Rows Missing####################

prop <- colMeans(results==0)
J <- length(prop)
prop_table_sf <- cbind(c(1:J),prop)
colnames(prop_table_sf ) <- c("Number of Variables","Probability All Rows Missing")
prop_table_sf 
####################Generating Figure 2####################
#################Quartz Code is used to save the exact figure in the Mac OS environment, for windows and other environments ignore this command##################

quartz(width=8,height=4)

par(mfrow=c(1,2))

plot(y=c(1,expected),c(0:J),ylim=c(0,1),ylab="Expected Proportion Observed",xlab="Number of Variables",xlim=c(0,77),type="p", pch=20)

plot(y=c(0,prop),c(0:J),ylim=c(0,1),ylab="Probability All Rows Missing",xlab="Number of Variables",xlim=c(0,77),type="p", pch=20)

mtext("Missingness under Subsampling of Variables (State Failure Data)", side=3,line=-3,outer=TRUE)

quartz.save("sf-figure.pdf", type = "pdf", device = dev.cur(), dpi = 600)

